%pylab inline
import os
import itertools
import time
from sklearn.feature_extraction import text
from sklearn import cross_validation
from sklearn import decomposition
from sklearn import neighbors
from sklearn import ensemble
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import grid_search
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import pipeline
from sklearn import tree
from sklearn import svm
from sklearn import linear_model
from sklearn import cluster
from sklearn.kernel_approximation import RBFSampler
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import log_loss
from sklearn.lda import LDA
from sklearn.learning_curve import learning_curve
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import sklearn
import seaborn as sns
from IPython.display import Image
import matplotlib.pyplot as plt
import prettyplotlib as ppl
import brewer2mpl
import IPython
import pandas as pd
import scipy.io
import math
import pybrain
import pkg_resources
# Report interpreter and library versions for reproducibility of results.
# BUG FIX: sys was referenced below but never imported anywhere above.
import sys
# NOTE: numpy and matplotlib names come from the %pylab inline magic at the
# top of the notebook -- presumably intended; confirm if run outside IPython.
print("Python version is {}".format(sys.version))
print("sklearn version = {}".format(sklearn.__version__))
print("pandas version = {}".format(pd.__version__))
print("scipy version = {}".format(scipy.__version__))
print("numpy version = {}".format(numpy.__version__))
print("matplotlib version = {}".format(matplotlib.__version__))
print("seaborn version = {}".format(sns.__version__))
print("brewer2mpl version = {}".format(brewer2mpl.__version__))
print("IPython version = {}".format(IPython.__version__))
# Display the dataset banner image via IPython rich display.
from IPython.display import Image  # NOTE: already imported above; repeated here for notebook-cell independence
Image(filename="images/Apple_Twitter.jpg")
A look into the sentiment around Apple, based on tweets containing #AAPL, @apple, etc.
Contributors were given a tweet and asked whether the user was positive, negative, or neutral about Apple. (They were also allowed to mark "the tweet is not about the company Apple, Inc.")
Tweets cover a wide array of topics including stock performance, new products, IP lawsuits, customer service at Apple stores, etc.
Note: Data containing neutral sentiments has been removed to make the dataset a binary classification dataset.
Train dataset has 2500 rows and Test Dataset has 650 rows.
# --- Training data ---
# Load the training split; the first CSV column is used as the row index.
df = pd.read_csv('data/Apple_Twitter_Sentiment_Train.csv', index_col=0)
df.head()
X1 = df['text']  # raw tweet text (features, pre-vectorization)
y = df['sentiment']  # binary target: 0 = bad, 1 = good
print X1.shape
print y.shape
Image(filename="images/Apple_Twitter_Train_WordCloud.png")
# --- Test data ---
# Standalone test split: used only for final evaluation, never for training.
df_t = pd.read_csv('data/Apple_Twitter_Sentiment_Test.csv')
df_t.head()
Xt1 = df_t['text']  # raw tweet text of the test split
yt = df_t['sentiment']  # test-split labels
print Xt1.shape
print yt.shape
Image(filename="images/Apple_Twitter_Test_WordCloud.png")
Comparing the word clouds of train and test dataset, it's clear that the top words are very different, making the test dataset a good candidate for testing.
# TF-IDF representation of the tweets: unigrams through trigrams, English
# stop words removed, terms in fewer than 5 documents or more than 80% of
# documents dropped, sub-linear term-frequency scaling.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.8,
    stop_words='english',
    sublinear_tf=True,
    use_idf=True,
)
X = vectorizer.fit_transform(X1)   # vocabulary fitted on the training text only
Xt = vectorizer.transform(Xt1)     # test text projected onto the same vocabulary
print(X.shape)
print(Xt.shape)
vectorizer.vocabulary_.get('great')  # sanity check: column index of a known token
# Global plotting style: ColorBrewer Set2 palette, bold 16pt serif text,
# and a shared `font` dict used by every title/axis label below.
set2 = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
font = {
    'family': 'serif',
    'color':  'darkred',
    'weight': 'bold',
    'size':   16,
}
plt.rc('font', family='serif', size=16, weight='bold')
plt.style.use('fivethirtyeight')
# Default figure size: 6 x 6 inches.
plt.rcParams["figure.figsize"] = [6, 6]
# Inspect the class balance of the training set.
df2 = pd.unique(df['sentiment'].values)
print "Total number of unique sentiment = ", df2.shape
print "="*66
print "Sentiment:\n"
print df2
# NOTE: df2 is rebound here from an ndarray of unique values to a one-column DataFrame.
df2 = df[['sentiment']]
byCandidate = df2.groupby('sentiment').describe()
print "="*66
print "Totals by sentiment:\n"
print byCandidate
# Class counts: 0 = bad, 1 = good (the dataset is imbalanced).
bad , good = sum(df2.loc[:, 'sentiment'] == 0), sum(df2.loc[:, 'sentiment'] == 1)
print(bad , good)
from matplotlib import rcParams
rcParams['font.size'] = 18
# Named colors reused by both pie charts.
blue = '#5A6FFA'
green = '#A3EB5B'
red = '#FF0000'
# Training-set class-balance pie chart.
# NOTE(review): 'bad' is drawn in green and 'good' in red -- the reverse of the
# usual convention; confirm this color assignment is intended.
piechart = plt.pie(
(bad , good),
labels=('bad', 'good'),
shadow=True,
colors=(green, red),
explode=(0.06,0.06), # space between slices
startangle=90, # rotate counter-clockwise by 90 degrees
autopct='%1.1f%%',# display fraction as percentages
)
# Same class-balance inspection for the standalone test set.
df3 = pd.unique(df_t['sentiment'].values)
print "Total number of unique sentiment = ", df3.shape
print "="*66
print "Sentiment:\n"
print df3
# NOTE: df3 is rebound from an ndarray of unique values to a one-column DataFrame.
df3 = df_t[['sentiment']]
byCandidate = df3.groupby('sentiment').describe()
print "="*66
print "Totals by sentiment:\n"
print byCandidate
# Test-set class counts: 0 = bad, 1 = good.
bad , good = sum(df3.loc[:, 'sentiment'] == 0), sum(df3.loc[:, 'sentiment'] == 1)
print(bad , good)
# Test-set class-balance pie chart (same reversed green/red mapping as above).
piechart = plt.pie(
(bad , good),
labels=('bad', 'good'),
shadow=True,
colors=(green, red),
explode=(0.06,0.06), # space between slices
startangle=90, # rotate counter-clockwise by 90 degrees
autopct='%1.1f%%',# display fraction as percentages
)
Training set was split into train and validation set during 10-fold cross-validation and 80/20 ratio on other metric measurement phases before extracting scores on the full training set. Only the final tuned model was used on the test set. Test set is based on a completely separate test csv file and only used for final predictions. Test set wasn’t used in any training. Final model is trained on original training set (without splitting of train and validation) so that all training samples used before prediction on the standalone test set. This trifurcation of sets helps keep final accuracy of model less biased towards the training set and accurately gauge the predictive power on the unknown future samples in production.
Cross-validation scores for all algorithms were based on 10-fold cross-validation, which has lower variance than a single hold-out validation set. Single hold-out validation is equivalent to one-fold cross-validation: it relies on a single random split, so its score estimates are noisier and more likely to select an over-fitting final model. Cross-validation scores are based on the mean of the scores from the 10-fold cross-validation. Accuracy and F1 scores on the training set are based on 10-fold cross-validation.
# Illustration of Type I / Type II errors, referenced by the metric discussion.
Image(filename="images/Type-I-and-II-errors.jpg")
Accuracy score is a ratio of (true positives / total number of samples). Accuracy score does not incorporate the fact of unbalanced dataset in the scoring schema. Using only the accuracy score in this analysis might lead to picking a wrong model. Hence, combinations of all the metrics were used to pick the final winners. Accuracy scores were based on 10-fold cross-validation.
The precision is the ratio of true positives/ (true positives + false positives) which is the measure of the classifier’s ability to not label negative samples as positive. The recall is the ratio of true positives/ (true positives + false negatives) which is the measure of classifiers’ ability to identify all true positives. F1 score is a combination of precision and recall scores, which is given by the formula F1 = 2 (precision recall) / (precision + recall). Given both the dataset’s used in this analysis are unbalanced and skewed (uneven number of rows per target class), precision and recall scores alone will not be good measure of accuracy. F1 score which is a harmonic mean between precision and recall takes into measure the fact of imbalanced classes. F1-scores were based on 10-fold cross-validation and this is the one of the most important metrics used since both datasets are unbalanced.
Confusion matrix helps decipher the accuracy of the model by a detailed breakdown of the correct and incorrect classifications for each target class. This is very helpful for this analysis since both the datasets used are unbalanced datasets. Confusion matrixes are shown for both training and test datasets.
def models(clf, title, name):
    """Fit `clf` on the 80% train split, report accuracies, and plot a
    confusion-matrix heatmap over the full training set.

    Parameters
    ----------
    clf : sklearn estimator (unfitted)
    title : str -- suffix appended to the heatmap title
    name : str -- output filename (the savefig call is currently commented out)

    Side effects: uses module-level globals X_train, y_train, X_test, y_test,
    X, y and `font`; after plotting, refits `clf` in place on the full
    training data (X, y) so it is ready for the standalone test set.
    """
    start = time.time()
    print(clf)
    print("")
    # fit the model on the 80% train split
    clf.fit(X_train, y_train)
    # summarize the fit of the model
    # (a discarded clf.predict(X_test) call was removed here -- its result
    # was never used)
    score = clf.score(X_train, y_train)
    print("Classification score using train set: {}\n".format(str(score)))
    score = clf.score(X_test, y_test)
    print("Classification score using test set: {}\n".format(str(score)))
    print("="*66)
    print("")
    # confusion matrix over the whole training set
    expected = y
    predicted = clf.predict(X)
    cm = metrics.confusion_matrix(expected, predicted)
    target_names = ['bad', 'good']
    fig, ax = plt.subplots()
    sns_cm = sns.heatmap(cm, annot=True, fmt='', xticklabels=target_names, yticklabels=target_names, ax=ax)
    title = "Apple Tweets Training Dataset - " + title
    ax.set_title(title, y=1.08, fontdict=font)
    #fig.tight_layout()
    #fig.savefig(name, bbox_inches='tight')
    fig.show()
    # refit the model on the whole training dataset for later test-set use
    clf.fit(X, y)
    end = time.time()
    total = end - start
    minutes = total // 60.0
    seconds = total % 60.0
    print("Total running time for this model = {} seconds ({} minutes {} seconds)".format(total, minutes, seconds))
    print("")
    return
def models_test(clf, title, name):
    """Evaluate an already-fitted classifier on the standalone test set.

    Prints the test accuracy and a per-class precision/recall/F1 report, then
    draws a confusion-matrix heatmap.

    Parameters
    ----------
    clf : fitted sklearn estimator
    title : str -- suffix appended to the heatmap title
    name : str -- output filename (the savefig call is currently commented out)

    Uses module-level globals Xt, yt and `font`.
    """
    print(clf)
    print("")
    # make predictions on the held-out test set
    expected = yt
    predicted = clf.predict(Xt)
    # summarize the fit of the model
    score = clf.score(Xt, yt)
    print("Classification score using test set: {}\n".format(str(score)))
    print("="*66)
    print("")
    # per-class precision / recall / F1 breakdown
    print(metrics.classification_report(expected, predicted))
    print("="*66)
    # confusion-matrix heatmap
    cm = metrics.confusion_matrix(expected, predicted)
    target_names = ['bad', 'good']
    fig, ax = plt.subplots()
    sns_cm = sns.heatmap(cm, annot=True, fmt='', xticklabels=target_names, yticklabels=target_names, ax=ax)
    title = "Apple Tweets Test Dataset - " + title
    ax.set_title(title, y=1.08, fontdict=font)
    #fig.tight_layout()
    #fig.savefig(name, bbox_inches='tight')
    fig.show()
def plot_learning_curve(estimator, title, name, ylim=None, cv=None):
    """Plot training vs. cross-validation score as the training set grows.

    Parameters
    ----------
    estimator : unfitted sklearn estimator
    title : str -- suffix appended to the figure title
    name : str -- output filename (the savefig call is currently commented out)
    ylim : optional (low, high) y-axis limits
    cv : cross-validation spec forwarded to learning_curve; None keeps the
         sklearn default (generalized from a hard-coded local, backward
         compatible)

    Uses module-level globals X, y and `font`.
    """
    # 20 training-set sizes from 10% to 100% of the data
    train_sizes = np.linspace(.1, 1.0, 20)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    title = "Apple Tweets Dataset - " + title
    plt.title(title, y=1.08, fontdict=font)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples", fontdict=font)
    plt.ylabel("Score", fontdict=font)
    plt.grid(True)
    # +/- one standard deviation bands around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.plot(train_sizes, train_scores_mean, color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, color="g",
             label="Cross-validation score")
    plt.legend(bbox_to_anchor=(1.1, 1.05))
    #plt.savefig(name, bbox_inches='tight')
    plt.show()
    return
def plot_learning_curve_ot(model, title, name):
    """Plot train/test mean-squared error as a function of training-set size.

    The model is repeatedly refitted on growing prefixes of the train split
    (from 15 samples up to all of X_train, 20 steps) and the MSE of its
    predictions is recorded on both the fitted prefix and the validation set.
    Uses module-level globals X_train, y_train, X_test, y_test and `font`.
    `name` is only used by the (commented-out) savefig call.
    """
    mse = metrics.mean_squared_error
    sizes = np.linspace(15, X_train.shape[0], 20).astype(int)
    training_error = []
    test_error = []
    for size in sizes:
        prefix_X, prefix_y = X_train[:size], y_train[:size]
        model.fit(prefix_X, prefix_y)
        training_error.append(mse(model.predict(prefix_X), prefix_y))
        test_error.append(mse(model.predict(X_test), y_test))
    plt.plot(sizes, training_error, label='Training', color="r")
    plt.plot(sizes, test_error, label='Test', color="g")
    plt.title("Apple Tweets Dataset - " + title, y=1.08, fontdict=font)
    plt.xlabel("Training examples", fontdict=font)
    plt.ylabel("Error", fontdict=font)
    plt.ylim(0, 1)
    plt.grid(True)
    plt.legend(bbox_to_anchor=(1.1, 1.05))
    #plt.savefig(name, bbox_inches='tight')
    plt.show()
    return
def plot_roc(model, title, name):
    """Fit `model` on the train split and plot ROC curves for the validation
    split (red) and the standalone test set (green).

    Returns (train_auc, test_auc). Uses module-level globals X_train, y_train,
    X_test, y_test, Xt, yt and `font`; `name` is only used by the
    (commented-out) savefig call.

    NOTE(review): the curves are computed from hard class predictions rather
    than predict_proba / decision_function scores, so each "curve" has a
    single operating point even though the models pass probability=True --
    confirm this is intended.
    """
    model.fit(X_train, y_train)
    # validation-split ROC
    fpr, tpr, _ = metrics.roc_curve(y_test, model.predict(X_test))
    roc_auc = metrics.auc(fpr, tpr)
    # standalone test-set ROC
    fpr_t, tpr_t, _ = metrics.roc_curve(yt, model.predict(Xt))
    roc_auc_t = metrics.auc(fpr_t, tpr_t)
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.plot(fpr, tpr, 'r', label='Training = %0.2f' % roc_auc)
    plt.plot(fpr_t, tpr_t, 'g', label='Test = %0.2f' % roc_auc_t)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontdict=font)
    plt.ylabel('True Positive Rate', fontdict=font)
    plt.title("Apple Tweets Dataset - " + title, fontdict=font)
    plt.legend(loc="lower right", fontsize=20)
    #plt.savefig(name, bbox_inches='tight')
    plt.show()
    return roc_auc, roc_auc_t
# Densify the sparse TF-IDF matrices (some estimators below need dense input).
X = X.toarray()
Xt = Xt.toarray()
print X.shape
print y.shape
print Xt.shape
print yt.shape
# 80/20 train/validation split of the training data; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=42)
print X_train.shape
print y_train.shape
# Per-sample weights: positive class weighted 2x.
# NOTE(review): sample_weight is never passed to any fit() call below -- confirm
# it was meant to be used.
sample_weight = np.array([1 if i == 0 else 2 for i in y_train])
# Baseline: unpruned decision tree (entropy criterion, unlimited depth).
model = tree.DecisionTreeClassifier(criterion='entropy',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_samples_leaf=2, min_samples_split=2,
random_state=123, splitter='best')
models(model, "Decision Tree Before Pruning", "a_dt_b_cm_train.png")
plot_roc(model, "ROC/AUC Score (Decision Tree Before Pruning)", "a_dt_b_roc.png")
plot_learning_curve(model, "Learning Curve (Decision Tree Before Pruning)", "a_dt_b_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (Decision Tree Before Pruning)", "a_dt_b_lce.png")
models_test(model, "Decision Tree Before Pruning", "a_dt_b_cm_test.png")
# Sweep min_samples_split from 1 to 23 to study pruning strength on the
# validation split.
results = []
for n in range(1, 24, 1):
    clf = tree.DecisionTreeClassifier(criterion='entropy',
        max_depth=24, max_features="auto", max_leaf_nodes=None,
        min_samples_leaf=6, min_samples_split=n,
        random_state=123, splitter='best')
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    # fraction of correct predictions on the validation split
    accuracy = np.where(preds == y_test, 1, 0).sum() / float(len(X_test))
    # NOTE(review): "%3f" is a width-3 float (same as %f); "%.3f" was likely
    # intended, but kept for consistency with the rest of the file.
    print("Decision Tree: %d, Accuracy: %3f" % (n, accuracy))
    results.append([n, accuracy])
results = pd.DataFrame(results, columns=["n", "accuracy"])
plt.plot(results.n, results.accuracy)
plt.ylim([0.5, 1])
plt.xlabel('Min Sample Split', fontdict=font)
plt.ylabel('Score', fontdict=font)
# BUG FIX: title previously read "Purning".
plt.title("Apple Tweets Dataset - Decision Tree Accuracy with Pruning", y=1.08, fontdict=font)
#plt.savefig("a_dt_pruning.png", bbox_inches='tight')
plt.show()
# Tuned ("pruned") decision tree: bounded depth, larger leaf/split minimums.
model = tree.DecisionTreeClassifier( criterion='entropy',
max_depth=40, max_features="auto", max_leaf_nodes=None,
min_samples_leaf=4, min_samples_split=10,
random_state=123, splitter='best')
models(model, "Decision Tree After Pruning", "a_dt_a_cm_train.png")
# Keep the AUC scores for the summary scatter charts at the end of the file.
dt_roc, dt_roct = plot_roc(model, "ROC/AUC Score (Decision Tree After Pruning)", "a_dt_a_roc.png")
plot_learning_curve(model, "Learning Curve (Decision Tree After Pruning)", "a_dt_a_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (Decision Tree After Pruning)", "a_dt_a_lce.png")
models_test(model, "Decision Tree After Pruning", "a_dt_a_cm_test.png")
# Final decision tree: wall-clock timing, 10-fold CV accuracy/F1, and
# standalone test-set metrics (dt_* variables feed the summary charts below).
dt_train_start = time.time()
model.fit(X, y)
dt_train_end = time.time()
dt_train_time = dt_train_end - dt_train_start
print("Decision Tree Training time in seconds : %3f" % (dt_train_time))
print("")
decisiontree_model = model
print(decisiontree_model)
print("")
preds = decisiontree_model.predict(X_test)
#dt_validation_accuracy = np.where(preds==y_test, 1, 0).sum() / float(len(X_test))
# 10-fold cross-validated accuracy and F1 on the full training data.
scores = cross_validation.cross_val_score(model, X, y, cv=10)
dt_validation_accuracy = scores.mean()
print("Decision Tree Validation Set Accuracy: %3f" % (dt_validation_accuracy))
print("")
scores = cross_validation.cross_val_score(model, X, y, cv=10, scoring='f1')
dt_validation_f1 = scores.mean()
print("Decision Tree Validation Set f1: %3f" % (dt_validation_f1))
print("")
# Prediction latency and metrics on the standalone test set.
dt_predict_start = time.time()
preds = decisiontree_model.predict(Xt)
dt_predict_end = time.time()
dt_predict_time = dt_predict_end - dt_predict_start
print("Decision Tree Prediction time in seconds : %3f" % (dt_predict_time))
print("")
dt_test_accuracy = np.where(preds == yt, 1, 0).sum() / float(len(Xt))
print("Decision Tree Test Set Accuracy: %3f" % (dt_test_accuracy))
print("")
dt_test_f1 = metrics.f1_score(yt, preds, average="micro")
# BUG FIX: previously printed dt_test_accuracy under the "f1 Score" label.
print("Decision Tree Test Set f1 Score: %3f" % (dt_test_f1))
print("")
# Baseline KNN: Euclidean distance (default p=2), k=2.
model = neighbors.KNeighborsClassifier(n_neighbors=2, weights='uniform')
models(model, "KNN Euclidean Distance & 2 Neighbors", "a_knn_b_cm_train.png")
plot_roc(model, "ROC/AUC Score (KNN Euclidean Distance & 2 Neighbors)", "a_knn_b_roc.png")
plot_learning_curve(model, "Learning Curve (KNN Euclidean Distance & 2 Neighbors)", "a_knn_b_lca.png", ylim=None)
plot_learning_curve_ot(model, "Learning Curve (KNN Euclidean Distance & 2 Neighbors)", "a_knn_b_lce.png")
models_test(model, "KNN Euclidean Distance & 2 Neighbors", "a_knn_b_cm_test.png")
# Same baseline with Manhattan distance (p=1).
model = neighbors.KNeighborsClassifier(n_neighbors=2, weights='uniform', p=1)
models(model, "KNN Manhattan Distance & 2 Neighbors", "a_knn_b_cm2_train.png")
plot_roc(model, "ROC/AUC Score (KNN Manhattan Distance & 2 Neighbors", "a_knn_b2_roc.png")
plot_learning_curve(model, "Learning Curve (KNN Manhattan Distance & 2 Neighbors)", "a_knn_b2_lca.png", ylim=None)
plot_learning_curve_ot(model, "Learning Curve (KNN Manhattan Distance & 2 Neighbors)", "a_knn_b2_lce.png")
models_test(model, "KNN Manhattan Distance & 2 Neighbors", "a_knn_b2_cm_test.png")
# Sweep k from 1 to 50 (Euclidean distance) and plot validation accuracy.
results = []
for n in range(1, 51, 1):
    clf = neighbors.KNeighborsClassifier(n_neighbors=n, weights="uniform", p=2)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    # fraction of correct predictions on the validation split
    accuracy = np.where(preds == y_test, 1, 0).sum() / float(len(X_test))
    print("Neighbors: %d, Accuracy: %3f" % (n, accuracy))
    results.append([n, accuracy])
results = pd.DataFrame(results, columns=["n", "accuracy"])
plt.ylim([0, 1])
plt.plot(results.n, results.accuracy)
# BUG FIX: label previously read "Neigbhours".
plt.xlabel("Number of Neighbours", fontdict=font)
plt.ylabel("Score", fontdict=font)
plt.title("Apple Tweets - KNN Accuracy with Increasing K", y=1.08, fontdict=font)
#plt.savefig("a_knn_neigbours.png", bbox_inches='tight')
plt.show()
# --- Tuned KNN: Euclidean distance, k=3 ---
model = neighbors.KNeighborsClassifier(n_neighbors=3, p=2, weights="uniform", leaf_size=30)
models(model, "KNN Euclidean Distance & 3 Neighbors", "a_knn_a_cm_train.png")
# Keep the AUC scores for the summary scatter charts at the end of the file.
knn_roc, knn_roct = plot_roc(model, "ROC/AUC Score (KNN Euclidean Distance & 3 Neighbors)", "a_knn_a_roc.png")
plot_learning_curve(model, "Learning Curve (KNN Euclidean Distance & 3 Neighbors)", "a_knn_a_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (KNN Euclidean Distance & 3 Neighbors)", "a_knn_a_lce.png")
models_test(model, "KNN Euclidean Distance & 3 Neighbors", "a_knn_a_cm_test.png")
# Wall-clock training time on the full training data.
knn_train_start = time.time()
model.fit(X, y)
knn_train_end = time.time()
knn_train_time = knn_train_end - knn_train_start
print("KNN Training time in seconds : %3f" % knn_train_time)
print("")
knn_model = model
print(knn_model)
print("")
preds = knn_model.predict(X_test)  # NOTE(review): result unused; overwritten below
# 10-fold cross-validated accuracy and F1 on the full training data.
scores = cross_validation.cross_val_score(model, X, y, cv=10)
knn_validation_accuracy = scores.mean()
print("KNN Validation Set Accuracy: %3f" % knn_validation_accuracy)
print("")
scores = cross_validation.cross_val_score(model, X, y, cv=10, scoring='f1')
knn_validation_f1 = scores.mean()
print("KNN Validation Set f1: %3f" % knn_validation_f1)
print("")
# Prediction latency and metrics on the standalone test set.
knn_predict_start = time.time()
preds = knn_model.predict(Xt)
knn_predict_end = time.time()
knn_predict_time = knn_predict_end - knn_predict_start
print("KNN Prediction time in seconds : %3f" % knn_predict_time)
print("")
knn_test_accuracy = np.where(preds == yt, 1, 0).sum() / float(len(Xt))
print("KNN Test Set Accuracy: %3f" % knn_test_accuracy)
print("")
knn_test_f1 = metrics.f1_score(yt, preds, average="micro")
print("KNN Test Set f1 Score: %3f" % knn_test_f1)
print("")
# Baseline AdaBoost over an unpruned entropy decision tree, 50 estimators.
model = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=2,
min_samples_split=2, min_weight_fraction_leaf=0.0,
random_state=123, splitter='best'), algorithm='SAMME', n_estimators=50, random_state=123 )
models(model, "AdaBoost Before Pruning", "a_ada_b_cm_train.png")
plot_roc(model, "ROC/AUC Score (AdaBoost Before Pruning)", "a_ada_b_roc.png")
plot_learning_curve(model, "Learning Curve (AdaBoost Before Pruning)", "a_ada_b_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (AdaBoost Before Pruning)", "a_ada_b_lce.png")
models_test(model, "AdaBoost Before Pruning", "a_ada_b_cm_test.png")
# Tuned AdaBoost: shallow (depth-4) pruned base tree, 100 estimators.
model = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
max_features=None, max_leaf_nodes=None, min_samples_leaf=9,
min_samples_split=16, min_weight_fraction_leaf=0.0,
random_state=123, splitter='best'), algorithm='SAMME', n_estimators=100, random_state=123 )
models(model, "AdaBoost After Pruning", "a_ada_a_cm_train.png")
# Keep the AUC scores for the summary scatter charts at the end of the file.
ada_roc, ada_roct = plot_roc(model, "ROC/AUC Score (AdaBoost After Pruning)", "a_ada_a_roc.png")
plot_learning_curve(model, "Learning Curve (AdaBoost After Pruning)", "a_ada_a_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (AdaBoost After Pruning)", "a_ada_a_lce.png")
models_test(model, "AdaBoost After Pruning", "a_ada_a_cm_test.png")
# --- Final AdaBoost: timing and cross-validated metrics ---
# Wall-clock training time on the full training data.
ada_train_start = time.time()
model.fit(X, y)
ada_train_end = time.time()
ada_train_time = ada_train_end - ada_train_start
print("AdaBoost Training time in seconds : %3f" % ada_train_time)
print("")
ada_model = model
print(ada_model)
print("")
preds = ada_model.predict(X_test)  # NOTE(review): result unused; overwritten below
# 10-fold cross-validated accuracy and F1 on the full training data.
scores = cross_validation.cross_val_score(model, X, y, cv=10)
ada_validation_accuracy = scores.mean()
print("AdaBoost Validation Set Accuracy: %3f" % ada_validation_accuracy)
print("")
scores = cross_validation.cross_val_score(model, X, y, cv=10, scoring='f1')
ada_validation_f1 = scores.mean()
print("AdaBoost Validation Set f1: %3f" % ada_validation_f1)
print("")
# Prediction latency and metrics on the standalone test set.
ada_predict_start = time.time()
preds = ada_model.predict(Xt)
ada_predict_end = time.time()
ada_predict_time = ada_predict_end - ada_predict_start
print("AdaBoost Prediction time in seconds : %3f" % ada_predict_time)
print("")
ada_test_accuracy = np.where(preds == yt, 1, 0).sum() / float(len(Xt))
print("AdaBoost Test Set Accuracy: %3f" % ada_test_accuracy)
print("")
ada_test_f1 = metrics.f1_score(yt, preds, average="micro")
print("AdaBoost Test Set f1 Score: %3f" % ada_test_f1)
print("")
# SVM with the default RBF kernel, C=1; probability=True enables predict_proba.
model = svm.SVC(probability=True, random_state=123)
models(model, "SVM RBF Kernel & penality parameter C=1", "a_svm_b_cm_train.png")
plot_roc(model, "ROC/AUC Score (SVM RBF Kernel & penality parameter C=1)", "a_svm_b_roc.png")
plot_learning_curve(model, "Learning Curve (SVM RBF Kernel & penality parameter C=1)", "a_svm_b_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (SVM RBF Kernel & penality parameter C=1)", "a_svm_b_lce.png")
models_test(model, "SVM RBF Kernel & penality parameter C=1", "a_svm_b_cm_test.png")
# SVM with a polynomial kernel, C=1.
# BUG FIX: this previously constructed a default (RBF-kernel) SVC identical to
# the one above, even though every title here says "Poly Kernel";
# kernel='poly' makes the model match its labels.
model = svm.SVC(kernel='poly', probability=True, random_state=123)
models(model, "SVM Poly Kernel & penality parameter C=1", "a_svm_b2_cm_train.png")
plot_roc(model, "ROC/AUC Score (SVM Poly Kernel & penality parameter C=1)", "a_svm_b2_roc.png")
plot_learning_curve(model, "Learning Curve (SVM Poly Kernel & penality parameter C=1)", "a_svm_b2_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (SVM Poly Kernel & penality parameter C=1)", "a_svm_b2_lce.png")
models_test(model, "SVM Poly Kernel & penality parameter C=1", "a_svm_b2_cm_test.png")
# SVM with a linear kernel, C=1 (gamma=0.0 meant "auto" in this sklearn era).
model = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
kernel='linear', max_iter=-1, probability=True, random_state=123,
shrinking=True, tol=0.001, verbose=False)
models(model, "SVM Linear Kernel & penality parameter C=1", "a_svm_a_cm_train.png")
# Keep the AUC scores for the summary scatter charts at the end of the file.
svm_roc, svm_roct = plot_roc(model, "ROC/AUC Score (SVM Linear Kernel & penality parameter C=1)", "a_svm_a_roc.png")
plot_learning_curve(model, "Learning Curve (SVM Linear Kernel & penality parameter C=1)", "a_svm_a_lca.png", ylim=(0,1.05))
plot_learning_curve_ot(model, "Learning Curve (SVM Linear Kernel & penality parameter C=1)", "a_svm_a_lce.png")
models_test(model, "SVM Linear Kernel & penality parameter C=1", "a_svm_a_cm_test.png")
# --- Final SVM (linear kernel): timing and cross-validated metrics ---
# Wall-clock training time on the full training data.
svm_train_start = time.time()
model.fit(X, y)
svm_train_end = time.time()
svm_train_time = svm_train_end - svm_train_start
print("SVM Training time in seconds : %3f" % svm_train_time)
print("")
svm_model = model
print(svm_model)
print("")
preds = svm_model.predict(X_test)  # NOTE(review): result unused; overwritten below
# 10-fold cross-validated accuracy and F1 on the full training data.
scores = cross_validation.cross_val_score(model, X, y, cv=10)
svm_validation_accuracy = scores.mean()
print("SVM Validation Set Accuracy: %3f" % svm_validation_accuracy)
print("")
scores = cross_validation.cross_val_score(model, X, y, cv=10, scoring='f1')
svm_validation_f1 = scores.mean()
print("SVM Validation Set f1: %3f" % svm_validation_f1)
print("")
# Prediction latency and metrics on the standalone test set.
svm_predict_start = time.time()
preds = svm_model.predict(Xt)
svm_predict_end = time.time()
svm_predict_time = svm_predict_end - svm_predict_start
print("SVM Prediction time in seconds : %3f" % svm_predict_time)
print("")
svm_test_accuracy = np.where(preds == yt, 1, 0).sum() / float(len(Xt))
print("SVM Test Set Accuracy: %3f" % svm_test_accuracy)
print("")
svm_test_f1 = metrics.f1_score(yt, preds, average="micro")
print("SVM Test Set f1 Score: %3f" % svm_test_f1)
print("")
# ---------------------------------------------------------------------------
# Summary bubble charts: each bubble compares a model's test metric (x) with
# its cross-validated training metric (y); bubble size encodes sort rank.
# NOTE(review):
#  * DataFrame.sort() is the pre-0.17 pandas API (replaced by sort_values
#    in later versions) -- matches the vintage of the sklearn APIs used here.
#  * The 'Neural Net' annotation branches are vestigial: no neural-net model
#    is trained anywhere in this file.
#  * The color list `c` has 5 entries for 4 models.
# ---------------------------------------------------------------------------
# Chart 1: test vs. validation accuracy.
df_data = [{'Model' : 'Decision Tree', 'Test_Accuracy': dt_test_accuracy, 'Training_Accuracy':dt_validation_accuracy},
{'Model' : 'KNN', 'Test_Accuracy': knn_test_accuracy, 'Training_Accuracy':knn_validation_accuracy},
{'Model' : 'AdaBoost', 'Test_Accuracy': ada_test_accuracy, 'Training_Accuracy':ada_validation_accuracy},
{'Model' : 'SVM', 'Test_Accuracy': svm_test_accuracy, 'Training_Accuracy':svm_validation_accuracy}]
df_scores = pd.DataFrame(df_data)
df_scores = df_scores.sort('Test_Accuracy')
df_scores.head()
a = df_scores.Test_Accuracy.values
b = df_scores.Training_Accuracy.values
z = df_scores.Model.values
c = ["b", "r", "m", "c", "g"]
s = [60*3**n for n in range(len(a))]
fig, ax = plt.subplots()
ax.scatter(a,b,c=c,s=s, edgecolor=c, alpha=0.5)
ax.set_xlabel(r'Test Accuracy', fontsize=20, fontdict=font)
ax.set_ylabel(r'Validation Accuracy', fontsize=20, fontdict=font)
ax.set_title('Apple Twitter Sentiment - Test vs Training Accuracy', fontsize=30, fontdict=font)
ax.grid(True)
# Annotate each bubble; offsets are chosen per model to avoid overlap.
for i, txt in enumerate(a):
#ax.annotate(z[i], (a[i],b[i]))
if z[i] == 'Neural Net':
ha = "right"
xytext = (-60,60)
elif z[i] == 'Decision Tree' or z[i] == 'AdaBoost':
ha = "right"
xytext = (-20,40)
else:
ha = "left"
xytext = (30,-20)
ax.annotate(
z[i] + " " + str(round(a[i],3)) + " vs. " + str(round(b[i],3)),
xy = (a[i],b[i],), xytext = xytext,
textcoords = 'offset points' , ha = ha, va = "bottom",
bbox = dict(boxstyle = 'round,pad=0.3', fc = 'yellow', alpha = 0.6),
arrowprops = dict(arrowstyle = "fancy", connectionstyle = 'arc3,rad=0.1'))
fig.set_size_inches(18.0, 10.5)
fig.tight_layout()
#fig.savefig("ats_classifiers_performance.png", bbox_inches='tight')
fig.show()
# Chart 2: test vs. validation F1 score.
df_f1_data = [{'Model' : 'Decision Tree', 'Test_f1': dt_test_f1, 'Training_f1':dt_validation_f1},
{'Model' : 'KNN', 'Test_f1': knn_test_f1, 'Training_f1':knn_validation_f1},
{'Model' : 'AdaBoost', 'Test_f1': ada_test_f1, 'Training_f1':ada_validation_f1},
{'Model' : 'SVM', 'Test_f1': svm_test_f1, 'Training_f1':svm_validation_f1}]
df_f1_scores = pd.DataFrame(df_f1_data)
df_f1_scores = df_f1_scores.sort('Test_f1')
df_f1_scores.head()
a = df_f1_scores.Test_f1.values
b = df_f1_scores.Training_f1.values
z = df_f1_scores.Model.values
c = ["b", "r", "m", "c", "g"]
s = [60*3**n for n in range(len(a))]
fig, ax = plt.subplots()
ax.scatter(a,b,c=c,s=s, edgecolor=c, alpha=0.5)
ax.set_xlabel(r'Test f1 Score', fontsize=20, fontdict=font)
ax.set_ylabel(r'Validation f1 Score', fontsize=20, fontdict=font)
ax.set_title('Apple Twitter Sentiment - Test vs Training f1 Scores', fontsize=30, fontdict=font)
ax.grid(True)
for i, txt in enumerate(a):
#ax.annotate(z[i], (a[i],b[i]))
if z[i] == 'Neural Net':
ha = "right"
xytext = (-60,60)
elif z[i] == 'Decision Tree' or z[i] == 'AdaBoost':
ha = "right"
xytext = (-20,40)
else:
ha = "left"
xytext = (30,-20)
ax.annotate(
z[i] + " " + str(round(a[i],3)) + " vs. " + str(round(b[i],3)),
xy = (a[i],b[i],), xytext = xytext,
textcoords = 'offset points' , ha = ha, va = "bottom",
bbox = dict(boxstyle = 'round,pad=0.3', fc = 'yellow', alpha = 0.6),
arrowprops = dict(arrowstyle = "fancy", connectionstyle = 'arc3,rad=0.1'))
fig.set_size_inches(18.0, 10.5)
fig.tight_layout()
#fig.savefig("ats_f1_performance.png", bbox_inches='tight')
fig.show()
# Chart 3: training time vs. prediction time (seconds).
df_time_data = [{'Model' : 'Decision Tree', 'Training_Time': dt_train_time, 'Prediction_Time':dt_predict_time},
{'Model' : 'KNN', 'Training_Time': knn_train_time, 'Prediction_Time':knn_predict_time},
{'Model' : 'AdaBoost', 'Training_Time': ada_train_time, 'Prediction_Time':ada_predict_time},
{'Model' : 'SVM', 'Training_Time': svm_train_time, 'Prediction_Time':svm_predict_time}]
df_times = pd.DataFrame(df_time_data)
df_times = df_times.sort('Training_Time')
df_times.head()
a = df_times.Training_Time.values
b = df_times.Prediction_Time.values
z = df_times.Model.values
c = ["b", "r", "m", "c", "g"]
s = [60*4**n for n in range(len(a))]
fig, ax = plt.subplots()
ax.scatter(a,b,c=c,s=s, edgecolor=c, alpha=0.5)
ax.set_xlabel(r'Training Time (seconds)', fontsize=20, fontdict=font)
ax.set_ylabel(r'Prediction Time (seconds)', fontsize=20, fontdict=font)
ax.set_title('Apple Twitter Sentiment - Training vs Prediction Time', fontsize=30, fontdict=font)
ax.grid(True)
for i, txt in enumerate(a):
#ax.annotate((z[i], a[i],b[i],
#            arrowprops=dict(arrowstyle="->"))
if z[i] == 'SVM':
ha = "left"
xytext = (-60,60)
elif z[i] == 'Decision Tree' or z[i] == 'Neural Net':
ha = "right"
xytext = (-20,40)
else:
ha = "left"
xytext = (30,-20)
ax.annotate(
z[i] + " " + str(round(a[i],3)) + " vs. " + str(round(b[i],3)),
xy = (a[i],b[i],), xytext = xytext,
textcoords = 'offset points' , ha = ha, va = "bottom",
bbox = dict(boxstyle = 'round,pad=0.3', fc = 'yellow', alpha = 0.6),
arrowprops = dict(arrowstyle = "fancy", connectionstyle = 'arc3,rad=0.1'))
fig.set_size_inches(18.0, 10.5)
fig.tight_layout()
#fig.savefig("ats_classifiers_time.png", bbox_inches='tight')
fig.show()
# Chart 4: test vs. training AUC score (from plot_roc's hard-label curves).
df_roc_data = [{'Model' : 'Decision Tree', 'Test_ROC': dt_roct, 'Training_ROC':dt_roc},
{'Model' : 'KNN', 'Test_ROC': knn_roct, 'Training_ROC':knn_roc},
{'Model' : 'AdaBoost', 'Test_ROC': ada_roct, 'Training_ROC':ada_roc},
{'Model' : 'SVM', 'Test_ROC': svm_roct, 'Training_ROC':svm_roc}]
df_rocs = pd.DataFrame(df_roc_data)
df_rocs = df_rocs.sort('Test_ROC')
df_rocs.head()
a = df_rocs.Test_ROC.values
b = df_rocs.Training_ROC.values
z = df_rocs.Model.values
c = ["b", "r", "m", "c", "g"]
s = [60*4**n for n in range(len(a))]
fig, ax = plt.subplots()
ax.scatter(a,b,c=c,s=s, edgecolor=c, alpha=0.5)
ax.set_xlabel(r'Test AUC Score', fontsize=20, fontdict=font)
ax.set_ylabel(r'Validation AUC Score', fontsize=20, fontdict=font)
ax.set_title('Apple Twitter Sentiment - Test vs Training AUC Score', fontsize=30, fontdict=font)
ax.grid(True)
for i, txt in enumerate(a):
#ax.annotate((z[i], a[i],b[i],
#            arrowprops=dict(arrowstyle="->"))
if z[i] == 'SVM':
ha = "left"
xytext = (-60,60)
elif z[i] == 'Decision Tree':
ha = "right"
xytext = (-20,40)
else:
ha = "left"
xytext = (30,-20)
ax.annotate(
z[i] + " " + str(round(a[i],3)) + " vs. " + str(round(b[i],3)),
xy = (a[i],b[i],), xytext = xytext,
textcoords = 'offset points' , ha = ha, va = "bottom",
bbox = dict(boxstyle = 'round,pad=0.3', fc = 'yellow', alpha = 0.6),
arrowprops = dict(arrowstyle = "fancy", connectionstyle = 'arc3,rad=0.1'))
fig.set_size_inches(18.0, 10.5)
fig.tight_layout()
fig.show()